---
title: "Analysis"
output:
  html_document:
    code_folding: hide
author: "Ross Dahlke"
date: "`r format(Sys.time(), '%B %d, %Y')`"
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
#install.packages("tidyverse")
library(tidyverse)
#install.packages("janitor")
library(janitor) 
options(scipen = 999999)
```


```{r}
start_date <- as.Date("2021-05-01") 
end_date <- as.Date("2022-03-31")
```

# Twitter


```{r}
twitter_by_date <- read_csv("../data/twitter-by-date.csv")
```


## Time-Series Plots


```{r}
date_breaks <- seq.Date(as.Date("2021-01-06"), as.Date("2021-01-06") + 700, "3 months")

twitter_n_plot <- ggplot(twitter_by_date, aes(date, j6_n)) +
  scale_x_date(date_breaks = "2 months", date_labels =  "%b %Y") +
  scale_y_continuous(breaks = seq(0, 30000, 1000), labels = scales::comma_format()) +
  labs(title = "A. Number of Tweets by Elite Political Influencers about January 6",
       x = "Date",
       y = "Number of Posts per Day") +
  geom_vline(aes(xintercept = as.Date("2021-01-06")), linetype = "dashed", color = "grey80") +
  geom_vline(aes(xintercept = as.Date("2022-01-06")), linetype = "dashed", color = "grey80") +
  annotate("text", x = as.Date("2021-03-01"), y=4100, label="January 6, 2021", angle=0, size=4, color="grey20") +
  annotate("text", x = as.Date("2022-03-01"), y=4100, label="January 6, 2022", angle=0, size=4, color="grey20") +
  geom_line() +
  scale_x_date(
    date_labels = "%b %d, %Y",  
    breaks = date_breaks, 
  ) +
  theme_bw()
```

## Non-parametric testing

### Cut dates

```{r}
defendants_table <- read_csv("../data/defendants_table_extracted.csv")
```

```{r}
defendants_table_long <- defendants_table %>% 
  select(-name) %>% 
  pivot_longer(c("arrested", "self_surrendered", "charged", "arraigned", "plea_agreement", "sentenced", "jury_guilty")) %>% 
  filter(!is.na(value)) %>% 
  mutate(date = lubridate::mdy(value)) %>% 
  count(name, date) 

defendants_date <- expand.grid(date = seq.Date(min(defendants_table_long$date), max(defendants_table_long$date), 1),
       name = defendants_table_long %>% distinct(name) %>% pull()) %>% 
  left_join(defendants_table_long) %>% 
  filter(date >= start_date & date <= end_date) %>% 
  mutate(n = replace_na(n, 0)) %>% 
  filter(name == "arrested") %>% 
  filter(date >= start_date & date <= end_date)
```

```{r}
arrests_sd <- defendants_date %>% 
  mutate(mean = mean(n),
         sd = sd(n),
         sd_2 = mean + sd * 2,
         sd_3 = mean + sd * 3,
         sd_4 = mean + sd * 4,
         sd_2_binary = if_else(n > sd_2, 1, 0),
         sd_3_binary = if_else(n > sd_3, 1, 0),
         sd_4_binary = if_else(n > sd_4, 1, 0),
         sd_type = case_when(
           sd_4_binary == 1 ~ "4",
           sd_3_binary == 1 ~ "3",
           sd_2_binary == 1 ~ "2",
           T ~ "other"
         ))
```


```{r}
arrests_cut_dates <- arrests_sd %>% 
  filter(sd_2_binary == 1) %>% 
  pull(date)
```

```{r}
stories <- read_csv("../data/media_cloud_j6_stories.csv")
```

```{r}
stories_sd <- stories %>% 
  mutate(date = lubridate::mdy(date)) %>% 
  filter(date >= start_date & date <= end_date) %>% 
  pivot_longer(cols = c("j6_stories", "j6_stories_per"), values_to = "n") %>% 
  group_by(name) %>% 
  mutate(mean = mean(n),
         sd = sd(n),
         sd_2 = mean + sd * 2,
         sd_3 = mean + sd * 3,
         sd_4 = mean + sd * 4,
         sd_2_binary = if_else(n > sd_2, 1, 0),
         sd_3_binary = if_else(n > sd_3, 1, 0),
         sd_4_binary = if_else(n > sd_4, 1, 0),
         sd_type = case_when(
           sd_4_binary == 1 ~ "4",
           sd_3_binary == 1 ~ "3",
           sd_2_binary == 1 ~ "2",
           T ~ "other"
         ))
```


```{r}
cut_dates_tibble <- stories_sd %>% 
  filter(sd_2_binary == 1) %>% 
  select(date) %>% 
  bind_rows(arrests_sd %>% filter(sd_2_binary == 1) %>% select(date) %>% mutate(name = "arrests")) %>% 
  filter(date >= start_date & date <= end_date) %>% 
  mutate(date = as.Date(date)) %>% 
  filter(name %in% c("j6_stories", "arrests"))
```

```{r}
cut_dates <- cut_dates_tibble %>% 
  ungroup() %>% 
  arrange(date) %>% 
  distinct(date) %>%
  filter(date < as.Date("2022-01-06") - 30 | date > as.Date("2022-01-06") + 30) %>% 
  pull(date)
```

### Testing

Now, the main plot based on these cut dates. 

```{r}
twitter_null_dist <- read_csv("../data/twitter_j6_null_distribution_roberta.csv") %>% 
  mutate(group = case_when(
    group == "group_christian_constitutionalists" ~ "Christian Constitutionalists",
    group == "group_conservative_media" ~ "Conservative Media",
    group == "group_number_maga" ~ "#MAGA",
    group == "group_tea_party_conservatives" ~ "Tea Party Conservatives",
    group == "group_white_nationalists" ~ "White Nationalists"
  )) 
```

```{r}
twitter_cut_diffs <- read_csv("../data/twitter_cut_diffs_roberta.csv") %>% 
  filter(date %in% cut_dates) %>% 
  mutate(group = case_when(
    group == "group_christian_constitutionalists" ~ "Christian Constitutionalists",
    group == "group_conservative_media" ~ "Conservative Media",
    group == "group_number_maga" ~ "#MAGA",
    group == "group_tea_party_conservatives" ~ "Tea Party Conservatives",
    group == "group_white_nationalists" ~ "White Nationalists"
  )) 
```

```{r}
twitter_non_parametric_testing <- twitter_cut_diffs %>% 
  rename(cut_date = date, cut_diff = diff) %>% 
  left_join(twitter_null_dist) %>% 
  mutate(greater_than_null = if_else(cut_diff > diff, 1, 0),
         less_than_null = if_else(cut_diff < diff, 1, 0)) %>% 
  group_by(window, group, cut_date, dv) %>% 
  summarize(n = n(),
            greater_than_null = sum(greater_than_null),
            less_than_null = sum(less_than_null)) %>% 
  mutate(sign_sig = case_when(
  greater_than_null / n >= .975 ~ "significant + positive",
  less_than_null / n >= .975 ~ "significant + negative",
  T ~ "not significant"
  ))
```

```{r}
twitter_null_dist_cis <- twitter_null_dist %>% 
  filter(window == 30) %>% 
  group_by(group) %>% 
  summarize(low = quantile(diff, .025),
            high = quantile(diff, .975)) %>% 
  pivot_longer(cols = c("low", "high"))
```

```{r}
twitter_cut_diffs_30 <- twitter_cut_diffs %>% 
  filter(window == 30)
```

# Gab

```{r}
gab_by_date <- read_csv("../data/gab-by-date.csv")
```

## Time-Series Plots


```{r}
date_breaks <- seq.Date(as.Date("2021-01-06"), as.Date("2021-01-06") + 700, "3 months")

gab_n_plot <- ggplot(gab_by_date, aes(date, j6_n)) +
  scale_x_date(date_breaks = "2 months", date_labels =  "%b %Y") +
  scale_y_continuous(labels = scales::comma_format()) +
  labs(title = "B. Number of Gab Posts about January 6",
       x = "Date",
       y = "Number of Posts per Day") +
  annotate("text", x = as.Date("2021-03-01"), y=1550, label="January 6, 2021", angle=0, size=4, color="grey20") +
  annotate("text", x = as.Date("2022-03-01"), y=1550, label="January 6, 2022", angle=0, size=4, color="grey20") +
  geom_vline(aes(xintercept = as.Date("2021-01-06")), linetype = "dashed", color = "grey80") +
  geom_vline(aes(xintercept = as.Date("2022-01-06")), linetype = "dashed", color = "grey80") +
  geom_line() +
  scale_x_date(
    date_labels = "%b %d, %Y",  
    breaks = date_breaks, 
  ) +
  theme_bw()
```

## Non-parametric testing

### Testing

```{r}
gab_null_dist <- read_csv("../data/gab_j6_null_distribution_roberta.csv") %>% 
  mutate(group = case_when(
    group == "group_christian_constitutionalists" ~ "Christian Constitutionalists",
    group == "group_conservative_media" ~ "Conservative Media",
    group == "group_number_maga" ~ "#MAGA",
    group == "group_tea_party_conservatives" ~ "Tea Party Conservatives",
    group == "group_white_nationalists" ~ "White Nationalists",
    group == "created_j6" ~ "Created Account on January 6",
    group == "posted_j6" ~ "Posted About January 6"
  )) 
```

```{r}
gab_cut_diffs <- read_csv("../data/gab_cut_diffs_roberta.csv") %>% 
  filter(date %in% cut_dates) %>% 
  mutate(group = case_when(
    group == "group_christian_constitutionalists" ~ "Christian Constitutionalists",
    group == "group_conservative_media" ~ "Conservative Media",
    group == "group_number_maga" ~ "#MAGA",
    group == "group_tea_party_conservatives" ~ "Tea Party Conservatives",
    group == "group_white_nationalists" ~ "White Nationalists",
    group == "created_j6" ~ "Created Account on January 6",
    group == "posted_j6" ~ "Posted About January 6"
  )) 
```

```{r}
gab_non_parametric_testing <- gab_cut_diffs %>% 
  rename(cut_date = date, cut_diff = diff) %>% 
  left_join(gab_null_dist) %>% 
  mutate(greater_than_null = if_else(cut_diff > diff, 1, 0),
         less_than_null = if_else(cut_diff < diff, 1, 0)) %>% 
  group_by(window, group, cut_date, dv) %>% 
  summarize(n = n(),
            greater_than_null = sum(greater_than_null),
            less_than_null = sum(less_than_null)) %>% 
  mutate(sign_sig = case_when(
  greater_than_null / n >= .975 ~ "significant + positive",
  less_than_null / n >= .975 ~ "significant + negative",
  T ~ "not significant"
  ))
```


```{r}
gab_null_dist_cis <- gab_null_dist %>% 
  filter(window == 30) %>% 
  group_by(group) %>% 
  summarize(low = quantile(diff, .025),
            high = quantile(diff, .975)) %>% 
  pivot_longer(cols = c("low", "high"))
```

```{r}
gab_cut_diffs_30 <- gab_cut_diffs %>% 
  filter(window == 30)
```


# Combined Plots

```{r}
#install.packages("ggpubr")
library(ggpubr)
```


```{r}
ggpubr::ggarrange(twitter_n_plot, gab_n_plot, nrow = 2)

ggsave("../figures/figure-1.pdf")
```

```{r}
min_date <- stories_sd %>%
  filter(name == "j6_stories") %>%
  summarize(min_date = min(date)) %>%
  pull(min_date)

date_breaks = seq.Date(min_date, min_date + 700, "2 months")

stories_plot <- stories_sd %>% 
  filter(name == "j6_stories") %>% 
  ggplot(aes(date, n)) +
  geom_line(alpha = .2) +
  geom_point(data = stories_sd %>% filter(sd_type != "other" & name == "j6_stories"), color = "black", size = 2) +
  scale_x_date(
    date_labels = "%b %d, %Y",  
    breaks = date_breaks, 
  ) +
  geom_vline(aes(xintercept = as.Date("2022-01-06")), linetype = "dashed", color = "grey20") +
  annotate("text", x = as.Date("2022-02-12"), y=210, label="January 6, 2022", angle=0, size=4, color="grey20") +
  theme_bw() +
  labs(title = "A. Media Stories about January 6",
       # subtitle = "Points represent dates that 2 SD or more above mean",
       x = "Date",
       y = "Number of Stories per Day")

arrests_plot <- arrests_sd %>% 
  ggplot(aes(date, n)) +
  geom_line(alpha = .2) +
  geom_point(data = arrests_sd %>% filter(sd_type != "other"), color = "black", size = 2) +
  scale_x_date(
    date_labels = "%b %d, %Y",  
    breaks = date_breaks, 
  ) +
  geom_vline(aes(xintercept = as.Date("2022-01-06")), linetype = "dashed", color = "grey20") +
  annotate("text", x = as.Date("2022-02-12"), y=10.5, label="January 6, 2022", angle=0, size=4, color="grey20") +
  theme_bw() +
  labs(title = "B. Arrests of January 6 Defendants",
       # subtitle = "Points represent dates that 2 SD or more above mean",
       x = "Date",
       y = "Number of Arrests per Day")

ggpubr::ggarrange(stories_plot, arrests_plot, nrow = 2)

ggsave("../figures/figure-2.pdf")
```

```{r fig.height = 8, fig.width = 10}
twitter_null_dist %>%
  filter(window == 30) %>% 
  mutate(platform = "Twitter") %>% 
  bind_rows(gab_null_dist %>% filter(window == 30) %>% mutate(platform = "Gab")) %>% 
  ggplot(aes(diff)) +
  geom_histogram() +
  geom_vline(data = twitter_null_dist_cis %>% mutate(platform = "Twitter"), aes(xintercept = value), size = 1, color = "#440154", linetype = "dashed") +
  geom_vline(data = twitter_cut_diffs_30 %>% mutate(platform = "Twitter"), aes(xintercept = diff), size = .25, color = "#5ec962", alpha = .5) +
  geom_vline(data = gab_null_dist_cis %>% mutate(platform = "Gab"), aes(xintercept = value), size = 1, color = "#440154", linetype = "dashed") +
  geom_vline(data = gab_cut_diffs_30 %>% mutate(platform = "Gab"), aes(xintercept = diff), size = .25, color = "#5ec962", alpha = .5) +
  labs(title = "Null Distribution of Non-Parametric Testing of Change in Volume of Posts +/- 30 Days",
       subtitle = "Dashed line = 95% CI | solid line = focal points",
       x = "Difference in # of Posts",
       y = "# of Days") +
  facet_wrap(platform~group, scales = "free") +
  theme_classic() 

ggsave("../figures/figure-3.pdf")
```

```{r fig.height = 10, fig.width = 10}
twitter_non_parametric_testing %>% 
  mutate(platform = "Twitter") %>% 
  bind_rows(gab_non_parametric_testing %>% mutate(platform = "Gab")) %>% 
  ggplot(aes(window, as.factor(cut_date), fill = sign_sig)) +
  geom_tile() +
  scale_y_discrete(limits=rev) +
  scale_fill_grey(start = 0.8, end = 0.2) +
  labs(title = "Non-Parametric Model Results +/- 5 to 30 days",
       x = "# of days window",
       y = "focal point",
       fill = "Significant") +
  theme_minimal() +
  facet_wrap(platform~group)

ggsave("../figures/figure-window-robustness.pdf")
```

ITSA Results

```{r fig.height = 10, fig.width = 10}
b2 <- read_csv("../data/twitter_its_results_roberta.csv") %>% 
  mutate(group = case_when(
    group == "group_christian_constitutionalists" ~ "Christian Constitutionalists",
    group == "group_conservative_media" ~ "Conservative Media",
    group == "group_number_maga" ~ "#MAGA",
    group == "group_tea_party_conservatives" ~ "Tea Party Conservatives",
    group == "group_white_nationalists" ~ "White Nationalists"
  )) %>% 
  janitor::clean_names() %>% 
  filter(event_date %in% cut_dates) %>% 
  select(group, event_date, value = b2_value, std_error = b2_std_error, p_value = b2_p_value) %>% 
  mutate(low = value - std_error * 1.96,
         high = value + std_error * 1.96,
         significant = if_else(p_value < .05, "Yes", "No"),
         event_date = fct_rev(as.factor(event_date)),
         coefficient = "beta[2]") 

b3 <- read_csv("../data/twitter_its_results_roberta.csv") %>% 
  mutate(group = case_when(
    group == "group_christian_constitutionalists" ~ "Christian Constitutionalists",
    group == "group_conservative_media" ~ "Conservative Media",
    group == "group_number_maga" ~ "#MAGA",
    group == "group_tea_party_conservatives" ~ "Tea Party Conservatives",
    group == "group_white_nationalists" ~ "White Nationalists"
  )) %>% 
  janitor::clean_names() %>% 
  filter(event_date %in% cut_dates) %>% 
  select(group, event_date, value = b3_value, std_error = b3_std_error, p_value = b3_p_value) %>% 
  mutate(low = value - std_error * 1.96,
         high = value + std_error * 1.96,
         significant = if_else(p_value < .05, "Yes", "No"),
         event_date = fct_rev(as.factor(event_date)),
         coefficient = "beta[3]") 

my_labeller <- function(labels) {
  labels$coefficient <- lapply(labels$coefficient, function(x) parse(text=x))
  return(labels)
}

b2 %>% 
  bind_rows(b3) %>%
  ggplot(aes(x = event_date, y = value, ymin = low, ymax = high, shape = significant)) +
  geom_hline(yintercept = 0) +
  geom_pointrange() +
  coord_flip() + 
  scale_shape_manual(values = c("No" = 16, "Yes" = 17)) +
  labs(y = "Coefficient",
       x = "Cut Date",
       shape = "p < .05",
       title = "Twitter Interrupted Time Series Analysis Results") +
  facet_wrap(group ~ coefficient, scales = "free", labeller = my_labeller) + 
  theme_bw()

ggsave("../figures/figure-twitter-its.pdf", width = 11, height = 10)
```

```{r fig.height = 10, fig.width = 10}
b2 <- read_csv("../data/gab_its_results_roberta.csv") %>% 
  mutate(group = case_when(
    group == "group_christian_constitutionalists" ~ "Christian Constitutionalists",
    group == "group_conservative_media" ~ "Conservative Media",
    group == "group_number_maga" ~ "#MAGA",
    group == "group_tea_party_conservatives" ~ "Tea Party Conservatives",
    group == "group_white_nationalists" ~ "White Nationalists",
    group == "created_j6" ~ "Created Account on January 6",
    group == "posted_j6" ~ "Posted About January 6"
  )) %>% 
  janitor::clean_names() %>% 
  filter(event_date %in% cut_dates) %>% 
  select(group, event_date, value = b2_value, std_error = b2_std_error, p_value = b2_p_value) %>% 
  mutate(low = value - std_error * 1.96,
         high = value + std_error * 1.96,
         significant = if_else(p_value < .05, "Yes", "No"),
         event_date = fct_rev(as.factor(event_date)),
         coefficient = "beta[2]") 

b3 <- read_csv("../data/gab_its_results_roberta.csv") %>% 
  mutate(group = case_when(
    group == "group_christian_constitutionalists" ~ "Christian Constitutionalists",
    group == "group_conservative_media" ~ "Conservative Media",
    group == "group_number_maga" ~ "#MAGA",
    group == "group_tea_party_conservatives" ~ "Tea Party Conservatives",
    group == "group_white_nationalists" ~ "White Nationalists",
    group == "created_j6" ~ "Created Account on January 6",
    group == "posted_j6" ~ "Posted About January 6"
  )) %>% 
  janitor::clean_names() %>% 
  filter(event_date %in% cut_dates) %>% 
  select(group, event_date, value = b3_value, std_error = b3_std_error, p_value = b3_p_value) %>% 
  mutate(low = value - std_error * 1.96,
         high = value + std_error * 1.96,
         significant = if_else(p_value < .05, "Yes", "No"),
         event_date = fct_rev(as.factor(event_date)),
         coefficient = "beta[3]") 

my_labeller <- function(labels) {
  labels$coefficient <- lapply(labels$coefficient, function(x) parse(text=x))
  return(labels)
}

b2 %>% 
  bind_rows(b3) %>%
  ggplot(aes(x = event_date, y = value, ymin = low, ymax = high, shape = significant)) +
  geom_hline(yintercept = 0) +
  geom_pointrange() +
  coord_flip() + 
  scale_shape_manual(values = c("No" = 16, "Yes" = 17)) +
  labs(y = "Coefficient",
       x = "Cut Date",
       shape = "p < .05",
       title = "Gab Interrupted Time Series Analysis Results") +
  facet_wrap(group ~ coefficient, scales = "free", labeller = my_labeller) + 
  theme_bw()

ggsave("../figures/figure-gab-its.pdf", width = 12, height = 16)
```



